Loading and cleaning data

This file contains time-series plots from the SoS database.

First, we load the SoS database and process the date columns.

# knitr::opts_chunk$set(message = FALSE)
library(plyr)
library(dplyr)
library(magrittr)
library(lubridate)
library(ggplot2)
library(devtools)

# Load the package under development so its helpers and datasets are available.
load_all()

# source("inst/scripts/1-load-and-clean.R", verbose = TRUE)
data(sos_raw)

# One identifier per row of sos_raw. seq_len() with sprintf() yields an empty
# vector for an empty table, whereas 1:nrow() would count down from 1 to 0.
sosid <- sprintf("SOS%d", seq_len(nrow(sos_raw)))

# Package helpers; presumably each returns one parsed date per row of sos_raw
# (verify against their definitions).
date_created <- clean_date_created(sos_raw)
date_terminated <- clean_date_terminated(sos_raw)

Processing data

Next, we create a data frame and use the lubridate package to create an interval for each system, spanning its creation and termination dates.

# Pair each system's creation and termination dates, then store the span
# between them as a lubridate interval. interval() is used because
# new_interval() was deprecated and later removed from lubridate.
sos_dates <- data.frame(date_created, date_terminated)
sos_dates$active_interval <- interval(sos_dates$date_created,
                                      sos_dates$date_terminated)

We then create a separate data frame with a column of every year from 1900 to 2015. Using that column, we create another column holding, for each year, the number of surveillance systems whose active interval encompasses that year.

I wrote a short function that takes as its arguments a year and a vector of lubridate intervals, and returns the number of those intervals that contain the year.

# Count how many of the given intervals contain `year`.
#
# year:      a single date-time to test.
# intervals: a vector of lubridate intervals.
#
# Returns the number of intervals for which `year %within% interval` is TRUE;
# comparisons that evaluate to NA are not counted.
sum_active_systems_for_year <- function(year, intervals) {
  is_active <- year %within% intervals
  sum(is_active, na.rm = TRUE)
}

# One row per calendar year, 1900 through 2015, parsed to date-times.
time_series <- data.frame(year = parse_date_time(1900:2015, orders = "y"))

# Count the active systems for each year. vapply() pins the result to exactly
# one integer per year, so an unexpected return shape fails loudly instead of
# silently changing the column type the way sapply() can.
time_series$number_active <- vapply(
  time_series$year,
  sum_active_systems_for_year,
  integer(1),
  intervals = sos_dates$active_interval
)

# # We could also run this:
# time_series %>%
#   group_by(year) %>%
#   mutate(number_active = sum(year %within% sos_dates$active_interval, na.rm = TRUE)) %>%
#   ungroup()

Plots

We can then plot this.

# Quick first look at the series. Uses ggplot() + geom_line() in place of
# qplot(), which ggplot2 has deprecated; the resulting plot is the same.
ggplot(time_series, aes(x = year, y = number_active)) +
  geom_line()

# Cleaner plot using ggplot().
ggplot(time_series, aes(x = year, y = number_active)) +
  geom_line() +
  theme_bw() +
  labs(x = "Year", y = "Count of Active Systems",
       title = "Number of Active Surveillance Systems over Time")

# With number of systems on a log scale.
ggplot(time_series, aes(x = year, y = number_active)) +
  geom_line() +
  theme_bw() +
  labs(x = "Year", y = "Count of Active Systems (log scale)",
       title = "Number of Active Surveillance Systems over Time") +
  scale_y_log10()

Splitting by Other Variables

First we're going to split and color the plot by entity type.

This is somewhat cumbersome, as we have to iterate over both the levels of entity_type and the values of year.

# Attach each system's entity type, requested from the helper as a factor.
sos_dates$entity_type <- clean_entity_type(sos_raw, return_type = "factor")

# Every (year, entity type) combination that needs a count.
time_series_entity <- expand.grid(
  year = time_series$year,
  entity_type = levels(sos_dates$entity_type)
)

# Each group is a single (entity type, year) pair: count the systems of that
# type whose active interval contains the year.
time_series_entity <- time_series_entity %>%
  group_by(entity_type, year) %>%
  mutate(
    number_active = sum_active_systems_for_year(
      year,
      sos_dates[sos_dates$entity_type == entity_type, "active_interval"]
    )
  ) %>%
  ungroup()
# Shared data, aesthetics and theme for every entity-type plot below; the
# individual plots differ only in geom, scale and labels.
entity_plot <- ggplot(
  time_series_entity,
  aes(x = year, y = number_active, fill = entity_type, color = entity_type,
      order = desc(entity_type))
) +
  theme_bw()

# Stacked area chart of counts.
entity_plot +
  geom_area() +
  labs(x = "Year", y = "Count of Active Systems",
       title = "Number of Active Surveillance Systems over Time by Entity Type (area)")

# NOTE(review): identical to the previous plot; kept so the document still
# renders the same number of figures.
entity_plot +
  geom_area() +
  labs(x = "Year", y = "Count of Active Systems",
       title = "Number of Active Surveillance Systems over Time by Entity Type (area)")

# Areas rescaled so each year sums to 1. The axis therefore shows each entity
# type's share of active systems, and the labels say so rather than "count".
entity_plot +
  geom_area(position = "fill") +
  labs(x = "Year", y = "Proportion of Active Systems",
       title = "Proportion of Active Surveillance Systems over Time by Entity Type (area)")

# Overlapping (unstacked) translucent areas.
entity_plot +
  geom_area(position = "identity", alpha = 0.25) +
  labs(x = "Year", y = "Count of Active Systems",
       title = "Number of Active Surveillance Systems over Time by Entity Type")

# Overlapping translucent areas on a log10 y axis.
entity_plot +
  geom_area(position = "identity", alpha = 0.25) +
  scale_y_log10() +
  labs(x = "Year", y = "Count of Active Systems",
       title = "Number of Active Surveillance Systems over Time by Entity Type")

# One line per entity type.
entity_plot +
  geom_line() +
  labs(x = "Year", y = "Count of Active Systems",
       title = "Number of Active Surveillance Systems over Time by Entity Type (line)")

# One line per entity type on a log10 y axis.
entity_plot +
  geom_line() +
  scale_y_log10() +
  labs(x = "Year", y = "Count of Active Systems (log-transformed)",
       title = "Number of Active Surveillance Systems over Time by Entity Type (line)")

Date Created

library(reshape2)
# Quick histograms of the years in which systems were created and terminated,
# one bin per year. Written with ggplot() because qplot() is deprecated; the
# empty data frame plus an aesthetic evaluated from the workspace mirrors what
# qplot() builds for a bare vector, so the axis labels stay the same.
ggplot(data.frame(), aes(x = year(sos_dates$date_created))) +
  geom_histogram(binwidth = 1)
ggplot(data.frame(), aes(x = year(sos_dates$date_terminated))) +
  geom_histogram(binwidth = 1)

# Long-format table of creation/termination events: one row per system and
# event type, tagged with the calendar year of the event. Keeps years from
# 1950 onward and excludes 2015.
date_hist <- sos_dates %>%
  select(date_created, date_terminated) %>%
  data.frame(sosid, .) %>%
  melt(id.vars = "sosid") %>%
  mutate(year = year(value)) %>%
  filter(year >= 1950, year != 2015)

# Mirrored histogram with one bin per year: creations are counted upward and
# terminations downward (negated count). The constant fill labels produce the
# "Group" legend.
ggplot() +
  geom_histogram(
    mapping = aes(x = year, y = ..count.., fill = "Created"),
    data = date_hist %>% filter(variable == "date_created"),
    binwidth = 1
  ) +
  geom_histogram(
    mapping = aes(x = year, y = -..count.., fill = "Terminated"),
    data = date_hist %>% filter(variable == "date_terminated"),
    binwidth = 1
  ) +
  theme_bw() +
  scale_fill_hue("Group") +
  labs(
    title = "Number of Surveillance Systems Created and Terminated",
    x = "Year",
    y = "Number of Surveillance Systems"
  )
# ggsave(file = "inst/out/Number of Surveillance Systems Created and Terminated.pdf", width = 6.5, height = 4.5)
# ggsave(file = "inst/out/Number of Surveillance Systems Created and Terminated.png", width = 6.5, height = 4.5)

# Count creations and terminations per year, then spread them into one column
# per event type. value.var is named explicitly so dcast() does not have to
# guess it, and the summary is ungrouped before reshaping.
date_hist2 <- date_hist %>%
  group_by(year, variable) %>%
  summarize(count = n()) %>%
  ungroup() %>%
  dcast(year ~ variable, value.var = "count")

# Years in 1950-2014 with no events at all get explicit rows.
years_not_in_df <- setdiff(seq(1950, 2014), date_hist2$year)

# rep() keeps every column the same length as years_not_in_df, so this also
# works when no year is missing; a bare NA next to a zero-length year column
# would make data.frame() raise an error.
date_hist2 <- rbind(
  date_hist2,
  data.frame(
    year = years_not_in_df,
    date_created = rep(NA, length(years_not_in_df)),
    date_terminated = rep(NA, length(years_not_in_df))
  )
)

# Missing counts (filler rows, or years with only one kind of event) are zero.
date_hist2[is.na(date_hist2)] <- 0

# Over-under bar chart: creations per year as positive bars, terminations as
# negative bars, and a line for the difference (created minus terminated).
ggplot(date_hist2, aes(x = year)) +
  geom_bar(
    mapping = aes(y = date_created),
    fill = "#F8766D",
    stat = "identity"
  ) +
  geom_bar(
    mapping = aes(y = -date_terminated),
    fill = "#00BFC4",
    stat = "identity"
  ) +
  geom_line(mapping = aes(y = date_created - date_terminated), size = 0.5) +
  labs(
    title = "Number of Surveillance Systems Created and Terminated",
    x = "Year",
    y = "Number of Surveillance Systems"
  ) +
  theme_bw()
# ggsave(file = "inst/out/Number of Surveillance Systems Created and Terminated (with over-under line).pdf", width = 6.5, height = 4.5)
# ggsave(file = "inst/out/Number of Surveillance Systems Created and Terminated (with over-under line).png", width = 6.5, height = 4.5)



# Mirrored created/terminated histogram (one bin per year) overlaid with a
# line of the number of active systems in each year from time_series.
ggplot() +
  geom_histogram(
    mapping = aes(x = year, y = ..count.., fill = "Created"),
    data = date_hist %>% filter(variable == "date_created"),
    binwidth = 1
  ) +
  geom_histogram(
    mapping = aes(x = year, y = -..count.., fill = "Terminated"),
    data = date_hist %>% filter(variable == "date_terminated"),
    binwidth = 1
  ) +
  geom_line(
    mapping = aes(x = year(year), y = number_active),
    data = time_series
  ) +
  theme_bw() +
  scale_fill_hue("Group") +
  labs(
    title = "Number of Surveillance Systems Created and Terminated",
    x = "Year",
    y = "Number of Surveillance Systems"
  )


ecohealthalliance/sos documentation built on May 15, 2019, 7:56 p.m.